Diet (dis)-similarity

Author

Max Lindmark

Published

2024-12-10

Load packages

home <- here::here()

# Load libraries
library(tidyverse)
library(tidylog)
library(RCurl)
library(gllvm)
library(RColorBrewer)
library(ggrepel)
library(vegan)
library(patchwork)
library(RCurl)
library(devtools)
library(ggpubr)
library(janitor)
library(brms)
library(modelr)
library(tidybayes)
library(ggstats)
library(broom)
library(broom.mixed)

# Source map-plot
source_url("https://raw.githubusercontent.com/maxlindmark/cod-interactions/main/R/functions/map-plot.R")
#source(paste0(home, "R/functions/map-plot.R"))

# Source code for rotating ordination plot (to make it work in ggplot)
source_url("https://raw.githubusercontent.com/maxlindmark/cod-interactions/main/R/functions/rotate.R")
#source(paste0(home, "R/functions/rotate.R"))

Read data

d <- read_csv(paste0(home, "/data/clean/aggregated_stomach_data.csv"))

Diet similarity using bar plots and ordination

Ordination using gllvm

d2 <- d |>
  dplyr::select(ends_with("_tot"))

d2$pred_id <- d$pred_id

d2 <- d2 |>
  left_join(d |> dplyr::select(pred_length_cm, pred_weight_g, pred_id, species), by = "pred_id")
left_join: added 3 columns (pred_length_cm, pred_weight_g, species)
           > rows only in x       0
           > rows only in y  (    0)
           > matched rows     9,864
           >                 =======
           > rows total       9,864
# Calculate relative prey weight and average by size-class
d3 <- d2 |> 
  pivot_longer(ends_with("_tot")) |> 
  mutate(value = value/pred_weight_g) |> 
  filter(pred_length_cm >= 10 & pred_length_cm <= 50) |> # These are the sizes we work with
  mutate(pred_length_cm = cut_width(pred_length_cm, 2),
         pred_length_cm = str_remove(pred_length_cm, "[(]")) |> 
  separate_wider_delim(pred_length_cm, delim = ",", names = c("pred_length_cm", "scrap")) |> 
  mutate(pred_length_cm = ifelse(pred_length_cm == "[9", "9", pred_length_cm),
         pred_length_cm = as.numeric(pred_length_cm)) |> 
  dplyr::select(-scrap) |> 
  group_by(species, pred_length_cm, name) |> 
  summarise(tot_prey_weight = mean(value)) |>
  ungroup() |> 
  mutate(name = str_replace(name, "_", " "),
         name = str_replace(name, "_", " "),
         name = str_remove(name, " tot"),
         name = str_to_title(name)) |> 
  filter(!(species == "Flounder" & pred_length_cm %in% c(45, 43, 39, 9)))
pivot_longer: reorganized (amphipoda_tot, bivalvia_tot, clupeidae_tot, clupea_harengus_tot, gadus_morhua_tot, …) into (name, value) [was 9864x19, now 147960x6]
filter: removed 6,915 rows (5%), 141,045 rows remaining
summarise: now 585 rows and 4 columns, 2 group variables remaining (species, pred_length_cm)
ungroup: no grouping variables
filter: removed 60 rows (10%), 525 rows remaining
#Sample size per 2 cm class?
# d3 |> filter(species == "Flounder") |> distinct(pred_length_cm) |> arrange(pred_length_cm)
#d2 |> pivot_longer(ends_with("_tot")) |> summarise(n = n(), .by = pred_id) |> distinct(n)
# t <- d2 |>
#   pivot_longer(ends_with("_tot")) |>
#   mutate(value = value/pred_weight_g) |>
#   filter(pred_length_cm >= 10 & pred_length_cm <= 50) |> # These are the sizes we work with
#   mutate(pred_length_cm = cut_width(pred_length_cm, 2),
#          pred_length_cm = str_remove(pred_length_cm, "[(]")) |>
#   separate_wider_delim(pred_length_cm, delim = ",", names = c("pred_length_cm", "scrap")) |>
#   mutate(pred_length_cm = ifelse(pred_length_cm == "[9", "9", pred_length_cm),
#          pred_length_cm = as.numeric(pred_length_cm)) |>
#   dplyr::select(-scrap) |>
#   filter(!(species == "Flounder" & pred_length_cm %in% c(45, 43, 39, 9))) |> 
#   summarise(n = n(), .by = c(species, pred_length_cm)) |>
#   mutate(n = n/15) |>
#   arrange(n)
# t
# summary(t)
# ggplot(t, aes(pred_length_cm, n, color = species)) + 
#   geom_point()

d3 |> filter(tot_prey_weight > 0) |> arrange(tot_prey_weight)
filter: removed 121 rows (23%), 404 rows remaining
# A tibble: 404 × 4
   species  pred_length_cm name            tot_prey_weight
   <chr>             <dbl> <chr>                     <dbl>
 1 Cod                  35 Non Bio             0.000000199
 2 Cod                  47 Amphipoda           0.000000251
 3 Cod                  45 Polychaeta          0.000000261
 4 Cod                  41 Bivalvia            0.000000340
 5 Flounder             33 Mysidae             0.000000366
 6 Cod                  43 Amphipoda           0.000000371
 7 Cod                  43 Polychaeta          0.000000480
 8 Cod                  45 Bivalvia            0.000000833
 9 Flounder             25 Clupea Harengus     0.00000102 
10 Cod                  47 Bivalvia            0.00000137 
# ℹ 394 more rows
d_wide <- d3 |> pivot_wider(names_from = name, values_from = tot_prey_weight)
pivot_wider: reorganized (name, tot_prey_weight) into (Amphipoda, Bivalvia, Clupea Harengus, Clupeidae, Gadus Morhua, …) [was 525x4, now 35x17]
d_mat <- d_wide |>
  dplyr::select(-species, -pred_length_cm) |>
  as.matrix()

str(d_mat)
 num [1:35, 1:15] 0.000329 0.000684 0.000535 0.000535 0.0004 ...
 - attr(*, "dimnames")=List of 2
  ..$ : NULL
  ..$ : chr [1:15] "Amphipoda" "Bivalvia" "Clupea Harengus" "Clupeidae" ...
# Fit gllvm model with two latend variables and no covariates, meaning it becomes an ordination
set.seed(999)

fit <- gllvm(y = d_mat, family = "tweedie", num.lv = 2)
Note that, the tweedie family is implemented using the extended variational approximation method. 
fit
Call: 
gllvm(y = d_mat, family = "tweedie", num.lv = 2)
family: 
[1] "tweedie"
method: 
[1] "VA"

log-likelihood:  2542.686 
Residual degrees of freedom:  466 
AIC:  -4967.372 
AICc:  -4952.146 
BIC:  -4715.831 
# Let's do a ggplot-residual instead
res <- residuals(fit)
p <- NCOL(fit$y)
sppind <- 1:p
ds_res <- res$residuals[, sppind]

# Compare with built-in plot from gllvm: plot(fit, which = 2)
ds_res |> 
  as.data.frame() |> 
  pivot_longer(cols = everything()) |> 
  ggplot(aes(sample = value)) + 
  geom_qq(size = 0.8) + 
  geom_qq_line() + 
  labs(y = "Sample quantiles",
       x = "Theoretical quantiles")
pivot_longer: reorganized (Amphipoda, Bivalvia, Clupea Harengus, Clupeidae, Gadus Morhua, …) into (name, value) [was 35x15, now 525x2]

ggsave(paste0(home, "/figures/supp/qq_ordi_gllvm.pdf"), width = 11, height = 11, units = "cm")

# Plot with GGplot
# Now make the ordination plot. We want to use ggplot, so need to hack it a bit
# See this thread: https://github.com/JenniNiku/gllvm/discussions/116
# ordiplot(fit, biplot = TRUE)

# Extract site scores
LVs = as.data.frame(getLV(fit))

# Bind LV site scores to metadata
# LVs = cbind(LVs, sim.meta)
# 
# ordiplot(fittw, biplot = TRUE)
# 
# ordiplot(fittw, biplot = TRUE, symbols = TRUE, spp.colors = "white")
# 
LVs2 <- rotate(fit)
 
# text?
labs <- LVs2$species |>
  as.data.frame() |> 
  rownames_to_column(var = "prey")

dd <- LVs2$sites |>
  as.data.frame() |> 
  bind_cols(d_wide |> dplyr::select(species, pred_length_cm)) |> 
  mutate(group = "Flounder",
         group = ifelse(species == "Cod" & pred_length_cm <=  25, "Small cod", group),
         group = ifelse(species == "Cod" & pred_length_cm > 25, "Large cod", group))

# Alternative palettes for the groups
pal <- (brewer.pal(n = 11, name = "RdYlBu")[c(11, 4, 1)])
#pal <- (brewer.pal(n = 11, name = "RdYlGn")[c(11, 4, 1)])
#pal <- (brewer.pal(n = 8, name = "Dark2")[c(8, 7, 6)])
#pal <- brewer.pal(n = 8, name = "Dark2")[c(2, 7, 6)]

ordi <- ggplot(dd, aes(x = V1, y = V2, color = factor(group, levels = c("Flounder", "Small cod", "Large cod")),
                       size = pred_length_cm)) +
  geom_point(alpha = 0.6) + 
  # stat_ellipse(aes(V1, V2, color = group), 
  #              inherit.aes = FALSE, linewidth = 1, alpha = 0.3) + 
  scale_radius(range = c(0.8, 4)) +
  scale_color_manual(values = pal) +
  scale_fill_manual(values = pal) + 
  labs(size = "Predator length [cm]", shape = "Species", fill = "Group", color = "",
       x = "Latent variable 1", y = "Latent variable 2") +
  geom_label_repel(data = labs, aes(V1, V2, label = prey),
                   color = "grey20", inherit.aes = FALSE, size = 2, alpha = 0.4,
                   box.padding = 0.25) +
  theme_sleek() +
  coord_cartesian(#xlim = c(-3, 3),
                  ylim = c(-2.5, 2.5)) + 
  guides(color = guide_legend(ncol = 4),
         size = guide_legend(ncol = 4)) + 
  theme(legend.position = "bottom",
        legend.box = "vertical",
        legend.margin = margin(-0.3, 0, 0, 0, unit = "cm"))

ordi

Make a plot of the ontogenetic development of diets

# Calculate relative prey weight and average by size-class
d3 <- d2 |> 
  pivot_longer(ends_with("_tot")) |> 
  mutate(value = value/pred_weight_g) |> 
  group_by(species, pred_length_cm, name) |> 
  summarise(tot_prey_weight = mean(value)) |>
  ungroup() |> 
  mutate(name = str_replace(name, "_", " "),
         name = str_replace(name, "_", " "),
         name = str_remove(name, " tot"),
         name = str_to_title(name)) #|> 
pivot_longer: reorganized (amphipoda_tot, bivalvia_tot, clupeidae_tot, clupea_harengus_tot, gadus_morhua_tot, …) into (name, value) [was 9864x19, now 147960x6]
summarise: now 1,545 rows and 4 columns, 2 group variables remaining (species, pred_length_cm)
ungroup: no grouping variables
  #filter(tot_prey_weight < quantile(tot_prey_weight, probs = 0.995))

max_size_cod <- 65

cod_important_prey <- d3 |>
  filter(species == "Cod") |> 
  mutate(pred_length_cm2 = ifelse(pred_length_cm > max_size_cod, max_size_cod -1, pred_length_cm)) |> 
  mutate(predator_length_grp = cut(pred_length_cm2, breaks = seq(0, 100, by = 5))) |> 
  group_by(name, predator_length_grp) |>
  summarise(prey_group_tot = sum(tot_prey_weight)) |> 
  ungroup() |> 
  group_by(predator_length_grp) |> 
  mutate(prop = prey_group_tot / sum(prey_group_tot)) |> 
  ungroup() |>
  mutate(max_size = as.numeric(substr(predator_length_grp, 5, 6)),
         max_size = ifelse(predator_length_grp == "(0,5]", 5, max_size),
         max_size = ifelse(predator_length_grp == "(5,10]", 10, max_size),
         predator = "Cod")
filter: removed 510 rows (33%), 1,035 rows remaining
summarise: now 195 rows and 3 columns, one group variable remaining (name)
ungroup: no grouping variables
ungroup: no grouping variables
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `max_size = as.numeric(substr(predator_length_grp, 5, 6))`.
Caused by warning:
! NAs introduced by coercion
max_size_fle <- 40

fle_important_prey <- d3 |>
  filter(species == "Flounder") |> 
  mutate(pred_length_cm2 = ifelse(pred_length_cm > max_size_fle, max_size_fle-1, pred_length_cm)) |> 
  mutate(predator_length_grp = cut(pred_length_cm2, breaks = seq(0, 100, by = 5))) |> 
  group_by(name, predator_length_grp) |>
  summarise(prey_group_tot = sum(tot_prey_weight)) |> 
  ungroup() |> 
  group_by(predator_length_grp) |> 
  mutate(prop = prey_group_tot / sum(prey_group_tot)) |> 
  ungroup() |>
  mutate(max_size = as.numeric(substr(predator_length_grp, 5, 6)),
         max_size = ifelse(predator_length_grp == "(0,5]", 5, max_size),
         max_size = ifelse(predator_length_grp == "(5,10]", 10, max_size),
         predator = "Flounder")
filter: removed 1,035 rows (67%), 510 rows remaining
summarise: now 105 rows and 3 columns, one group variable remaining (name)
ungroup: no grouping variables
ungroup: no grouping variables
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `max_size = as.numeric(substr(predator_length_grp, 5, 6))`.
Caused by warning:
! NAs introduced by coercion
area_plot <- bind_rows(fle_important_prey, cod_important_prey) |> 
  mutate(prop = replace_na(prop, 0))

n_cat <- nrow(area_plot |> distinct(name))
distinct: removed 285 rows (95%), 15 rows remaining
colourCount <- n_cat
getPalette <- colorRampPalette(brewer.pal(12, "Paired"))
pal2 <- getPalette(colourCount)

area_plot |> distinct(name)
distinct: removed 285 rows (95%), 15 rows remaining
# A tibble: 15 × 1
   name              
   <chr>             
 1 Amphipoda         
 2 Bivalvia          
 3 Clupea Harengus   
 4 Clupeidae         
 5 Gadus Morhua      
 6 Gobiidae          
 7 Mysidae           
 8 Non Bio           
 9 Other             
10 Other Crustacea   
11 Other Pisces      
12 Platichthys Flesus
13 Polychaeta        
14 Saduria Entomon   
15 Sprattus Sprattus 
# Dataframes for geom_text with sample size
n_cod <- d2 |>
  filter(species == "Cod") |> 
  mutate(pred_length_cm2 = ifelse(pred_length_cm > max_size_cod, max_size_cod -1, pred_length_cm)) |> 
  mutate(predator_length_grp = cut(pred_length_cm2, breaks = seq(0, 100, by = 5))) |> 
  group_by(predator_length_grp) |>
  summarise(n = length(unique(pred_id)))
filter: removed 3,882 rows (39%), 5,982 rows remaining
summarise: now 13 rows and 2 columns, ungrouped
n_fle <- d2 |>
  filter(species == "Flounder") |> 
  mutate(pred_length_cm2 = ifelse(pred_length_cm > max_size_fle, max_size_fle-1, pred_length_cm)) |> 
  mutate(predator_length_grp = cut(pred_length_cm2, breaks = seq(0, 100, by = 5))) |> 
  group_by(predator_length_grp) |>
  summarise(n = length(unique(pred_id)))
filter: removed 5,982 rows (61%), 3,882 rows remaining
summarise: now 7 rows and 2 columns, ungrouped
n_dat <- bind_rows(n_cod |> mutate(predator = "Cod"), 
                   n_fle |> mutate(predator = "Flounder")) |> 
  mutate(max_size = as.numeric(substr(predator_length_grp, 5, 6)),
         max_size = ifelse(predator_length_grp == "(0,5]", 5, max_size),
         max_size = ifelse(predator_length_grp == "(5,10]", 10, max_size))
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `max_size = as.numeric(substr(predator_length_grp, 5, 6))`.
Caused by warning:
! NAs introduced by coercion
area_plot <- area_plot |> 
  mutate(name = str_to_sentence(name))

bar_diet <- 
  ggplot(data = area_plot, aes(x = max_size, y = prop, fill = name, color = name)) +
  geom_col(width = 4.3) +
  geom_text(data = n_dat, aes(x = max_size, y = 1.08, label = n), inherit.aes = FALSE,
            size = 0, color = "white") +
  geom_text(data = n_dat, aes(x = max_size, y = 1.04, label = n), inherit.aes = FALSE,
            size = 2) +
  facet_wrap(~predator, scales = "free") +
  scale_fill_manual(values = pal2, name = "") +
  scale_color_manual(values = pal2, name = "") +
  coord_cartesian(expand = 0) +
  scale_x_continuous(breaks = seq(0, 100, 5)) +
  labs(y = "Proportion", x = "Max. predator size in group [cm]") +
  theme(legend.key.size = unit(0.2, 'cm'),
        #legend.text = element_text(face = "italic"),
        legend.position = "bottom",
        legend.margin = margin(-0.3, 0, 0, 0, unit = "cm"))

Combine plots

bar_diet / ordi + plot_annotation(tag_levels = "a") #+ plot_layout(heights = c(1, 1.6))

ggsave(paste0(home, "/figures/ordi_diet.pdf"), width = 17, height = 21, units = "cm")

Schoener’s overlap index

# Calculate relative prey weight and average by size-class
qyear_rect_sum <- d2 |> 
  pivot_longer(ends_with("_tot")) |> 
  filter(pred_length_cm > 10 & pred_length_cm <= 50) |> # These are the sizes we work with
  left_join(d |> dplyr::select(year, quarter, subdiv, ices_rect, X, Y, pred_id), by = "pred_id") |> 
  mutate(group = "Flounder",
         group = ifelse(species == "Cod" & pred_length_cm <=  25, "Small cod", group),
         group = ifelse(species == "Cod" & pred_length_cm > 25, "Large cod", group)) |> 
  group_by(year, quarter, ices_rect, group) |>
  summarise(n = length(unique(pred_id)),
            tot_prey = sum(value)) |> 
  ungroup() |> 
  group_by(year, quarter, ices_rect) |> 
  mutate(min_stom = min(n)) |> 
  ungroup() |>
  mutate(id = paste(year, quarter, ices_rect, group, sep = "_")) |> 
  dplyr::select(-year, -quarter, -ices_rect, -group) |> 
  filter(n > 3)
pivot_longer: reorganized (amphipoda_tot, bivalvia_tot, clupeidae_tot, clupea_harengus_tot, gadus_morhua_tot, …) into (name, value) [was 9864x19, now 147960x6]
filter: removed 7,695 rows (5%), 140,265 rows remaining
left_join: added 6 columns (year, quarter, subdiv, ices_rect, X, …)
           > rows only in x         0
           > rows only in y  (    513)
           > matched rows     140,265
           >                 =========
           > rows total       140,265
summarise: now 384 rows and 6 columns, 3 group variables remaining (year, quarter, ices_rect)
ungroup: no grouping variables
ungroup: no grouping variables
filter: removed 51 rows (13%), 333 rows remaining
diet_prop <- d2 |> 
  pivot_longer(ends_with("_tot")) |> 
  filter(pred_length_cm > 10 & pred_length_cm < 50) |> # These are the sizes we work with
  left_join(d |> dplyr::select(year, quarter, subdiv, ices_rect, X, Y, pred_id), by = "pred_id") |> 
  mutate(group = "Flounder",
         group = ifelse(species == "Cod" & pred_length_cm <=  25, "Small cod", group),
         group = ifelse(species == "Cod" & pred_length_cm > 25, "Large cod", group)) |> 
  group_by(year, quarter, ices_rect, group, name) |>
  summarise(tot_prey_by_grp = sum(value)) |> 
  ungroup() |>
  mutate(id = paste(year, quarter, ices_rect, group, sep = "_")) |> 
  filter(id %in% unique(qyear_rect_sum$id)) |> 
  left_join(qyear_rect_sum, by = "id") |> 
  mutate(prop_prey = tot_prey_by_grp / tot_prey)
pivot_longer: reorganized (amphipoda_tot, bivalvia_tot, clupeidae_tot, clupea_harengus_tot, gadus_morhua_tot, …) into (name, value) [was 9864x19, now 147960x6]
filter: removed 8,295 rows (6%), 139,665 rows remaining
left_join: added 6 columns (year, quarter, subdiv, ices_rect, X, …)
           > rows only in x         0
           > rows only in y  (    553)
           > matched rows     139,665
           >                 =========
           > rows total       139,665
summarise: now 5,760 rows and 6 columns, 4 group variables remaining (year, quarter, ices_rect, group)
ungroup: no grouping variables
filter: removed 765 rows (13%), 4,995 rows remaining
left_join: added 3 columns (n, tot_prey, min_stom)
           > rows only in x       0
           > rows only in y  (    0)
           > matched rows     4,995
           >                 =======
           > rows total       4,995
# Calculate overlap by year and ices rect and species group. Make the proportions wide!
# unique(diet_prop$group)
diet_prop_wide <- diet_prop |> 
  dplyr::select(-tot_prey_by_grp, -tot_prey, -id, -n) |> 
  pivot_wider(names_from = group, values_from = prop_prey) |> 
  mutate(abs_f_sc = abs(Flounder - `Small cod`),
         abs_f_lc = abs(Flounder - `Large cod`),
         abs_lc_sc = abs(`Large cod` - `Small cod`)) |> 
  group_by(year, quarter, ices_rect) |> 
  summarise(`Flounder\nSmall cod` = 1 - 0.5*sum(abs_f_sc),
            `Flounder\nLarge cod` = 1 - 0.5*sum(abs_f_lc),
            `Small cod\nLarge cod` = 1 - 0.5*sum(abs_lc_sc))
pivot_wider: reorganized (group, prop_prey) into (Flounder, Large cod, Small cod) [was 4995x7, now 2145x8]
summarise: now 143 rows and 6 columns, 2 group variables remaining (year, quarter)
diet_prop_wide$latitude <- mapplots::ices.rect(diet_prop_wide$ices_rect)$lat
diet_prop_wide$longitude <- mapplots::ices.rect(diet_prop_wide$ices_rect)$lon

diet_prop_wide <- sdmTMB::add_utm_columns(diet_prop_wide)
Warning: Multiple UTM zones detected.
Proceeding with the most common value.
You may wish to choose a different projection.
Proceeding with UTM zone 33N; CRS = 32633.
Visit https://epsg.io/32633 to verify.
ovr <- diet_prop_wide |>
  pivot_longer(c(`Flounder\nSmall cod`, `Flounder\nLarge cod`, `Small cod\nLarge cod`),
               names_to = "overlap_group", values_to = "overlap") |> 
  drop_na(overlap)
pivot_longer: reorganized (Flounder
Small cod, Flounder
Large cod, Small cod
Large cod) into (overlap_group, overlap) [was 143x10, now 429x9]
drop_na (grouped): removed 166 rows (39%), 263 rows remaining
summary(ovr$overlap)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
0.00000 0.02823 0.10553 0.18426 0.25716 0.97485 
# Plot diet overlap
# plot_map +
#   geom_sf(color = "gray80") + 
#   geom_point(data = ovr, aes(X*1000, Y*1000, color = overlap),
#              size = 10) + 
#   viridis::scale_color_viridis() +
#   geom_sf() + 
#   facet_wrap(~overlap_group, ncol = 2) + 
#   NULL
  
set.seed(99)
ps <- ggplot(ovr, aes(overlap_group, overlap, color = ices_rect)) + 
  geom_jitter(height = 0, width = 0.1, alpha = 0.3) + 
  geom_boxplot(aes(overlap_group, overlap),
               inherit.aes = FALSE, fill = NA, width = 0.2, alpha = 0.2, size = 0.6) + 
  viridis::scale_color_viridis(discrete = TRUE, name = "ICES\nrectangle") + 
  geom_hline(yintercept = 0.6, linetype = 2, alpha = 0.6) + 
  labs(y = "Schoener's overlap index",
       x = "") + 
  theme(legend.position = "bottom",
        legend.key.size = unit(0.01, "cm"),
        legend.text = element_text(size = 7),
        legend.title = element_text(size = 8),
        legend.margin = margin(-0.3, 0, 0, 0, unit = "cm"))

ps

# summary(ovr$overlap)
# 
ovr |> filter(overlap == 0)
filter (grouped): removed 259 rows (98%), 4 rows remaining
# A tibble: 4 × 9
# Groups:   year, quarter [4]
   year quarter ices_rect latitude longitude     X     Y overlap_group   overlap
  <dbl>   <dbl> <chr>        <dbl>     <dbl> <dbl> <dbl> <chr>             <dbl>
1  2018       1 42G6          56.8      16.5  592. 6291. "Flounder\nLar…       0
2  2021       1 42G6          56.8      16.5  592. 6291. "Small cod\nLa…       0
3  2021       4 43G6          57.2      16.5  591. 6346. "Flounder\nLar…       0
4  2022       4 42G6          56.8      16.5  592. 6291. "Flounder\nLar…       0

How does the diet overlap relate to biomass density of the species?

Trawl data

## Read in trawl data. There's one file for cod and one for flounder
# They go up to 2020 Q1
# trawl_surveys_cod <- read.csv("data/stomach-data/BITS/trawl/Trawl Surveys Zincl (L) COD.csv", sep = ";")
# trawl_surveys_fle <- read.csv("data/stomach-data/BITS/trawl/Trawl Surveys Zincl (L) FLE.csv", sep = ";")

trawl_surveys_cod <- read_delim(paste0(home, "/data/stomach-data/BITS/trawl/Trawl Surveys Zincl (L) COD.csv"), delim = ";")
Rows: 9199 Columns: 35
── Column specification ────────────────────────────────────────────────────────
Delimiter: ";"
chr  (10): Species, Index, Appr. status, Validity, Station name, ICES, Dist....
dbl  (13): Year, Quarter, Month, Seqno, Haul, Subsam, Processing, Size, Subd...
num   (7): Lat, Long, Bottom depth, Doorspread, Opening, Tot. no./hour, No./...
lgl   (4): Sex, Headropedepth, DNA sampleID, NumberToSample
date  (1): Date

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
trawl_surveys_fle <- read_delim(paste0(home, "/data/stomach-data/BITS/trawl/Trawl Surveys Zincl (L) FLE.csv"), delim = ";")
Rows: 7306 Columns: 35
── Column specification ────────────────────────────────────────────────────────
Delimiter: ";"
chr  (10): Species, Index, Appr. status, Validity, Station name, ICES, Dist....
dbl  (13): Year, Quarter, Month, Seqno, Haul, Subsam, Processing, Size, Subd...
num   (7): Lat, Long, Bottom depth, Doorspread, Opening, Tot. no./hour, No./...
lgl   (4): Sex, Headropedepth, DNA sampleID, NumberToSample
date  (1): Date

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Combine for the two species, filter and clean!
trawl_data <- rbind(trawl_surveys_cod, trawl_surveys_fle) |>
  clean_names() |>
  mutate(swept_area = as.numeric(gsub(",", "\\.", swept_area)),
         kg_hour = as.numeric(gsub(",", "\\.", kg_hour)),
         dist = as.numeric(gsub(",", "\\.", dist))) |> 
  as.data.frame()

str(trawl_data)
'data.frame':   16505 obs. of  35 variables:
 $ species            : chr  "cod" "cod" "cod" "cod" ...
 $ date               : Date, format: "2015-02-26" "2015-02-26" ...
 $ year               : num  2015 2015 2015 2015 2015 ...
 $ quarter            : num  1 1 1 1 1 1 1 1 1 1 ...
 $ month              : num  2 2 2 2 2 2 2 2 2 2 ...
 $ index              : chr  "OXBH.2015.2" "OXBH.2015.2" "OXBH.2015.2" "OXBH.2015.2" ...
 $ appr_status        : chr  "Locked" "Locked" "Locked" "Locked" ...
 $ seqno              : num  10048 10048 10048 10048 10048 ...
 $ haul               : num  2 2 2 2 2 2 2 2 2 2 ...
 $ validity           : chr  "V" "V" "V" "V" ...
 $ station_name       : chr  "SLAGGENABBEN" "SLAGGENABBEN" "SLAGGENABBEN" "SLAGGENABBEN" ...
 $ subsam             : num  0 0 0 0 0 0 0 0 0 0 ...
 $ processing         : num  0 0 0 0 0 0 0 0 0 0 ...
 $ size               : num  9 9 9 9 9 9 9 9 9 9 ...
 $ sex                : logi  NA NA NA NA NA NA ...
 $ subdiv             : num  25 25 25 25 25 25 25 25 25 25 ...
 $ rect               : num  3959 3959 3959 3959 3959 ...
 $ ices               : chr  "39G4" "39G4" "39G4" "39G4" ...
 $ lat                : num  55289 55289 55289 55289 55289 ...
 $ long               : num  143628 143628 143628 143628 143628 ...
 $ bottom_depth       : num  603 603 603 603 603 603 603 603 603 603 ...
 $ duration           : num  30 30 30 30 30 30 30 30 30 30 ...
 $ dist               : num  1.49 1.49 1.49 1.49 1.49 1.49 1.49 1.49 1.49 1.49 ...
 $ doorspread         : num  875 875 875 875 875 875 875 875 875 875 ...
 $ opening            : num  56 56 56 56 56 56 56 56 56 56 ...
 $ headropedepth      : logi  NA NA NA NA NA NA ...
 $ swept_area         : num  0.483 0.483 0.483 0.483 0.483 0.483 0.483 0.483 0.483 0.483 ...
 $ kg_hour            : num  1606 1606 1606 1606 1606 ...
 $ tot_no_hour        : num  57467 57467 57467 57467 57467 ...
 $ lengthcl           : num  200 210 220 230 240 250 260 270 280 290 ...
 $ no_hour            : num  552 368 736 184 1472 ...
 $ length_measure_type: chr  "Total length TL" "Total length TL" "Total length TL" "Total length TL" ...
 $ dna_sample_id      : logi  NA NA NA NA NA NA ...
 $ number_to_sample   : logi  NA NA NA NA NA NA ...
 $ smhi_serial_no     : num  1 1 1 1 1 1 1 1 1 1 ...
sort(unique(trawl_data$lat)) |> head(20)
 [1] 5541 5542 5608 5613 5619 5629 5632 5634 5703 5706 5712 5717 5721 5722 5723
[16] 5728 5729 5743 5746 5750
# For these coordinates, we can use the function Fede provided:
format.position <- function(x){
  sign.x <- sign(x)
  x <- abs(x)
  x <- ifelse(nchar(x)==3, paste("0",x,sep=""), x)
  x <- ifelse(nchar(x)==2, paste("00",x,sep=""), x)
  x <- ifelse(nchar(x)==1, paste("000",x,sep=""), x)
  dec.x <- as.numeric(paste(substring(x,1,2)))+as.numeric(paste(substring(x,3,4)))/60
  dec.x <- sign.x*dec.x
}

# Apply function
trawl_data$lat <- format.position(trawl_data$lat)
trawl_data$lon <- format.position(trawl_data$long)

trawl_data <- sdmTMB::add_utm_columns(trawl_data, ll_names = c("lon", "lat"))
Warning: Multiple UTM zones detected.
Proceeding with the most common value.
You may wish to choose a different projection.
Proceeding with UTM zone 33N; CRS = 32633.
Visit https://epsg.io/32633 to verify.
# SMHI serial no?
t <- trawl_data |> drop_na(smhi_serial_no)
drop_na: removed 2,390 rows (14%), 14,115 rows remaining
plot_map + 
  geom_point(data = trawl_data, aes(X*1000, Y*1000, color = factor(year)),
             alpha = 0.5, size = 0.3) +
  theme_sleek(base_size = 8)
Warning: `guide_colourbar()` needs continuous scales.
Warning: Removed 75 rows containing missing values or values outside the scale range
(`geom_point()`).

Read and join the biologial data

trawl_surveys_s_cod <- read_delim(paste0(home, "/data/stomach-data/BITS/biological/Trawl Surveys (S) COD.csv"), delim = ";")
Rows: 10381 Columns: 33
── Column specification ────────────────────────────────────────────────────────
Delimiter: ";"
chr   (9): Appr. status, Sampling type, Validity, Station name, ICES, Specie...
dbl  (22): Year, Quarter, Month, Trip No, Seqno, Haul, Subsam, Processing, S...
lgl   (1): Maturity No (SMSF)
date  (1): Date  

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
trawl_surveys_s_flounder <- read_delim(paste0(home, "/data/stomach-data/BITS/biological/Trawl Surveys (S) FLE.csv"), delim = ";")
Rows: 13946 Columns: 33
── Column specification ────────────────────────────────────────────────────────
Delimiter: ";"
chr   (9): Appr. status, Sampling type, Validity, Station name, ICES, Specie...
dbl  (20): Year, Quarter, Month, Trip No, Seqno, Haul, Subsam, Processing, S...
lgl   (3): Age quality, Age grading, Maturity No (SMSF)
date  (1): Date  

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
trawl_surveys_s_cod$Species <- "Cod"
trawl_surveys_s_flounder$Species <- "Flounder"

bio_data <- bind_rows(trawl_surveys_s_cod, trawl_surveys_s_flounder) |>
  clean_names()

# Join the trawl, bio and stomach data. First create a unique ID.
# In earlier versions I had a column called otolith number (fish ID!), which was really fish id, but it isn't here anymore.

# Add a new species code in the trawl data that matches the stomach data 
trawl_data <- trawl_data |> mutate(predator_code = ifelse(species == "cod", "COD", "FLE"))

bio_data <- bio_data |> mutate(predator_code = ifelse(species == "Cod", "COD", "FLE"))

unique(is.na(trawl_data[, c("year", "quarter", "haul")]))
      year quarter  haul
[1,] FALSE   FALSE FALSE
# Go ahead
trawl_data$haul_id <- paste(trawl_data$year,
                            trawl_data$quarter,
                            trawl_data$haul,
                            sep = "_")

# Should be a unique ID per length and predator code
trawl_data |> 
  group_by(haul_id, lengthcl, predator_code) |> 
  mutate(n = n()) |> 
  ungroup() |> 
  distinct(n)
ungroup: no grouping variables
distinct: removed 16,504 rows (>99%), one row remaining
# A tibble: 1 × 1
      n
  <int>
1     1
# Now we want the cpue of a species-size combination as a column, then make it distinct per haul
trawl_data_unique_haul <- trawl_data |> 
  dplyr::select(-species, -lengthcl, -predator_code, -kg_hour, -tot_no_hour, -no_hour, -length_measure_type, -sex, -long) |> # Remove any column that has anything to do with catch, because that info should come from the species dataframes below. I.e., after left_joining this and the catch data, any 0 zero catches should be NA in the joined data
  distinct(haul_id, .keep_all = TRUE)
distinct: removed 15,851 rows (96%), 654 rows remaining
trawl_data_cod <- trawl_data |> 
  filter(!validity == "I") |> 
  filter(predator_code == "COD") |> 
  mutate(kg = kg_hour * (duration / 60),
         kg_km2 = kg / swept_area) |> # make it biomass density
  drop_na(kg_km2) |> 
  mutate(size_group = ifelse(lengthcl >= 100 & lengthcl < 250, "small", NA),
         size_group = ifelse(lengthcl >= 250 & lengthcl < 500, "medium", size_group)) |> 
  group_by(haul_id, size_group) |> 
  summarise(kg_km2 = sum(kg_km2)) |> 
  filter(size_group %in% c("small", "medium")) |> 
  pivot_wider(names_from = size_group, values_from = kg_km2) |> 
  rename(mcod_kg_km2 = medium,
         scod_kg_km2 = small) |> 
  mutate(mcod_kg_km2 = replace_na(mcod_kg_km2, 0),
         scod_kg_km2 = replace_na(scod_kg_km2, 0))
filter: removed 32 rows (<1%), 16,473 rows remaining
filter: removed 7,290 rows (44%), 9,183 rows remaining
drop_na: removed 169 rows (2%), 9,014 rows remaining
summarise: now 1,130 rows and 3 columns, one group variable remaining (haul_id)
filter (grouped): removed 253 rows (22%), 877 rows remaining
pivot_wider: reorganized (size_group, kg_km2) into (medium, small) [was 877x3, now 468x3]
rename: renamed 2 variables (mcod_kg_km2, scod_kg_km2)
# Ok, so because this is catches (no hauls with no fish... the only reason I have 0 catches is if one of the size-groups doesn't have a catch!)

# Check with a haul_id
trawl_data |> 
  filter(predator_code == "COD" & haul_id == "2015_1_20") 
filter: removed 16,500 rows (>99%), 5 rows remaining
  species       date year quarter month        index appr_status seqno haul
1     cod 2015-02-28 2015       1     2 OXBH.2015.20      Locked 10057   20
2     cod 2015-02-28 2015       1     2 OXBH.2015.20      Locked 10057   20
3     cod 2015-02-28 2015       1     2 OXBH.2015.20      Locked 10057   20
4     cod 2015-02-28 2015       1     2 OXBH.2015.20      Locked 10057   20
5     cod 2015-02-28 2015       1     2 OXBH.2015.20      Locked 10057   20
  validity     station_name subsam processing size sex subdiv rect ices
1        V 11 S HOBURG BANK      0          0    9  NA     26 4163 41G8
2        V 11 S HOBURG BANK      0          0    9  NA     26 4163 41G8
3        V 11 S HOBURG BANK      0          0    9  NA     26 4163 41G8
4        V 11 S HOBURG BANK      0          0    9  NA     26 4163 41G8
5        V 11 S HOBURG BANK      0          0    9  NA     26 4163 41G8
       lat   long bottom_depth duration dist doorspread opening headropedepth
1 56.38333 182938          377       30  1.5        849       6            NA
2 56.38333 182938          377       30  1.5        849       6            NA
3 56.38333 182938          377       30  1.5        849       6            NA
4 56.38333 182938          377       30  1.5        849       6            NA
5 56.38333 182938          377       30  1.5        849       6            NA
  swept_area kg_hour tot_no_hour lengthcl no_hour length_measure_type
1      0.472   4.508          10      260       2     Total length TL
2      0.472   4.508          10      330       2     Total length TL
3      0.472   4.508          10      350       2     Total length TL
4      0.472   4.508          10      400       2     Total length TL
5      0.472   4.508          10      410       2     Total length TL
  dna_sample_id number_to_sample smhi_serial_no      lon        X        Y
1            NA               NA             19 18.48333 715.0414 6254.191
2            NA               NA             19 18.48333 715.0414 6254.191
3            NA               NA             19 18.48333 715.0414 6254.191
4            NA               NA             19 18.48333 715.0414 6254.191
5            NA               NA             19 18.48333 715.0414 6254.191
  predator_code   haul_id
1           COD 2015_1_20
2           COD 2015_1_20
3           COD 2015_1_20
4           COD 2015_1_20
5           COD 2015_1_20
trawl_data_cod
# A tibble: 468 × 3
# Groups:   haul_id [468]
   haul_id   mcod_kg_km2 scod_kg_km2
   <chr>           <dbl>       <dbl>
 1 2015_1_10   19145.         6892. 
 2 2015_1_12    7867.         2503. 
 3 2015_1_14   12351.         6444. 
 4 2015_1_15    3899.         2481. 
 5 2015_1_17    3583.         1792. 
 6 2015_1_2    38232.         8311. 
 7 2015_1_20      23.9           0  
 8 2015_1_21       0.401         0  
 9 2015_1_22     203.           18.4
10 2015_1_25      30.3          10.1
# ℹ 458 more rows
# Now do the same for flounder and join with the cod data, then with haul data!
# Different code because we do not need to worry about size
trawl_data_fle <- trawl_data |> 
  filter(!validity == "I") |> 
  filter(predator_code == "FLE") |> 
  mutate(kg = kg_hour * (duration / 60),
         kg_km2 = kg / swept_area) |> # make it biomass density
  #drop_na(kg_km2) |> 
  mutate(size_group = ifelse(lengthcl >= 100, "benthic", "all")) |> 
  group_by(haul_id, size_group) |> 
  summarise(fle_kg_km2 = sum(kg_km2)) |> 
  filter(size_group == "benthic")
filter: removed 32 rows (<1%), 16,473 rows remaining
filter: removed 9,183 rows (56%), 7,290 rows remaining
summarise: now 642 rows and 3 columns, one group variable remaining (haul_id)
filter (grouped): removed 152 rows (24%), 490 rows remaining
# Add back summaries to dataframe of unique hauls
trawl_data_unique_haul <- trawl_data_unique_haul |> filter(!validity == "I")
filter: removed 16 rows (2%), 638 rows remaining
trawl_data_wide <- left_join(trawl_data_unique_haul, trawl_data_cod, by = "haul_id")
left_join: added 2 columns (mcod_kg_km2, scod_kg_km2)
           > rows only in x   170
           > rows only in y  (  0)
           > matched rows     468
           >                 =====
           > rows total       638
trawl_data_wide <- left_join(trawl_data_wide, trawl_data_fle, by = "haul_id")
left_join: added 2 columns (size_group, fle_kg_km2)
           > rows only in x   148
           > rows only in y  (  0)
           > matched rows     490
           >                 =====
           > rows total       638
# Change the NA's to 0's... 
trawl_data_wide <- trawl_data_wide |> 
  mutate(scod_kg_km2 = replace_na(scod_kg_km2, 0),
         mcod_kg_km2 = replace_na(mcod_kg_km2, 0),
         fle_kg_km2 = replace_na(fle_kg_km2, 0))

# Now add in the same ID in the bio_data
unique(is.na(bio_data[, c("year", "quarter", "haul")]))
      year quarter  haul
[1,] FALSE   FALSE FALSE
# Go ahead
bio_data$haul_id <- paste(bio_data$year,
                          bio_data$quarter,
                          bio_data$haul,
                          sep = "_")
                                
unique(bio_data$haul_id) |> head(20)
 [1] "2015_1_94"  "2022_4_249" "2022_4_278" "2021_1_85"  "2022_1_53" 
 [6] "2018_4_9"   "2016_4_43"  "2017_1_73"  "2017_4_2"   "2017_4_4"  
[11] "2017_4_6"   "2019_4_90"  "2022_1_82"  "2016_1_55"  "2022_1_99" 
[16] "2022_4_246" "2022_4_254" "2022_4_279" "2021_1_80"  "2022_1_57" 
# Now join in trawl data into the bio data (and then into stomach data)
colnames(bio_data)
 [1] "date"                "year"                "quarter"            
 [4] "month"               "trip_no"             "appr_status"        
 [7] "sampling_type"       "seqno"               "haul"               
[10] "validity"            "station_name"        "subsam"             
[13] "processing"          "size"                "max_maturity"       
[16] "subdiv"              "rect"                "ices"               
[19] "species"             "lengthcl"            "fishno"             
[22] "age"                 "age_quality"         "age_grading"        
[25] "weight"              "sex"                 "maturity"           
[28] "maturity_no_smsf"    "stomach_sample_code" "specimen_note"      
[31] "individual_no"       "spec_tsn"            "day_night"          
[34] "predator_code"       "haul_id"            
colnames(trawl_data_wide)
 [1] "date"             "year"             "quarter"          "month"           
 [5] "index"            "appr_status"      "seqno"            "haul"            
 [9] "validity"         "station_name"     "subsam"           "processing"      
[13] "size"             "subdiv"           "rect"             "ices"            
[17] "lat"              "bottom_depth"     "duration"         "dist"            
[21] "doorspread"       "opening"          "headropedepth"    "swept_area"      
[25] "dna_sample_id"    "number_to_sample" "smhi_serial_no"   "lon"             
[29] "X"                "Y"                "haul_id"          "mcod_kg_km2"     
[33] "scod_kg_km2"      "size_group"       "fle_kg_km2"      
# Check first for overlapping columns, and if so, if one of the datasets has any NAs
common_cols <- intersect(colnames(bio_data), colnames(trawl_data_wide))

unique(is.na(trawl_data_wide[, common_cols]))
      date  year quarter month appr_status seqno  haul validity station_name
[1,] FALSE FALSE   FALSE FALSE       FALSE FALSE FALSE    FALSE        FALSE
[2,] FALSE FALSE   FALSE FALSE       FALSE FALSE FALSE    FALSE        FALSE
     subsam processing  size subdiv  rect  ices haul_id
[1,]  FALSE      FALSE FALSE  FALSE FALSE FALSE   FALSE
[2,]   TRUE       TRUE  TRUE  FALSE FALSE FALSE   FALSE
unique(is.na(bio_data[, common_cols]))
      date  year quarter month appr_status seqno  haul validity station_name
[1,] FALSE FALSE   FALSE FALSE       FALSE FALSE FALSE    FALSE        FALSE
     subsam processing  size subdiv  rect  ices haul_id
[1,]  FALSE      FALSE FALSE  FALSE FALSE FALSE   FALSE
# Trawl data has some NA's in the common columns. Select only the important columns
colnames(trawl_data_wide)[!colnames(trawl_data_wide) %in% colnames(bio_data)]
 [1] "index"            "lat"              "bottom_depth"     "duration"        
 [5] "dist"             "doorspread"       "opening"          "headropedepth"   
 [9] "swept_area"       "dna_sample_id"    "number_to_sample" "smhi_serial_no"  
[13] "lon"              "X"                "Y"                "mcod_kg_km2"     
[17] "scod_kg_km2"      "size_group"       "fle_kg_km2"      
trawl_data_wide <- trawl_data_wide |>
  dplyr::select(haul_id, fle_kg_km2, mcod_kg_km2, scod_kg_km2, lat, lon, bottom_depth, X, Y, smhi_serial_no)

bio_data <- left_join(bio_data, trawl_data_wide, by = "haul_id")
left_join: added 9 columns (fle_kg_km2, mcod_kg_km2, scod_kg_km2, lat, lon, …)
           > rows only in x        0
           > rows only in y  (   146)
           > matched rows     24,327
           >                 ========
           > rows total       24,327
names(bio_data)
 [1] "date"                "year"                "quarter"            
 [4] "month"               "trip_no"             "appr_status"        
 [7] "sampling_type"       "seqno"               "haul"               
[10] "validity"            "station_name"        "subsam"             
[13] "processing"          "size"                "max_maturity"       
[16] "subdiv"              "rect"                "ices"               
[19] "species"             "lengthcl"            "fishno"             
[22] "age"                 "age_quality"         "age_grading"        
[25] "weight"              "sex"                 "maturity"           
[28] "maturity_no_smsf"    "stomach_sample_code" "specimen_note"      
[31] "individual_no"       "spec_tsn"            "day_night"          
[34] "predator_code"       "haul_id"             "fle_kg_km2"         
[37] "mcod_kg_km2"         "scod_kg_km2"         "lat"                
[40] "lon"                 "bottom_depth"        "X"                  
[43] "Y"                   "smhi_serial_no"     

Summarise haul data on the ICES rectangle level to join into the diet

# Now calculate spatial overlap in biomass density... 
qyear_rect_sum_biom <- bio_data |> 
  dplyr::select(fle_kg_km2, mcod_kg_km2, scod_kg_km2, ices, year, quarter) |> 
  rename(ices_rect = ices) |> #,
  group_by(year, quarter, ices_rect) |>
  summarise(mean_fle = mean(fle_kg_km2),
            mean_scod = mean(scod_kg_km2),
            mean_mcod = mean(mcod_kg_km2)) 
rename: renamed one variable (ices_rect)
summarise: now 188 rows and 6 columns, 2 group variables remaining (year, quarter)
# Join into diet data
ovr <- ovr |> 
  pivot_wider(names_from = overlap_group, values_from = overlap) |> 
  drop_na() |> 
  left_join(qyear_rect_sum_biom)
pivot_wider: reorganized (overlap_group, overlap) into (Flounder
Small cod, Flounder
Large cod, Small cod
Large cod) [was 263x9, now 113x10]
drop_na (grouped): removed 38 rows (34%), 75 rows remaining
Joining with `by = join_by(year, quarter, ices_rect)`
left_join: added 3 columns (mean_fle, mean_scod, mean_mcod)
> rows only in x 0
> rows only in y (113)
> matched rows 75
> =====
> rows total 75

Fit zero inflated beta regressions using brms

# https://www.andrewheiss.com/blog/2021/11/08/beta-regression-guide/#zero-inflated-beta-regression-bayesian-style

# Full model
ovr <- ovr |>
  ungroup() |> 
  clean_names() |> 
  pivot_longer(c("flounder_small_cod", "flounder_large_cod", "small_cod_large_cod")) |> 
  mutate(value = ifelse(value < 1e-15, 0, value),
         mean_biom_sc = as.numeric(scale(log(mean_fle + mean_scod + mean_mcod) / 3)),
         ) #|> 
ungroup: no grouping variables
pivot_longer: reorganized (flounder_small_cod, flounder_large_cod, small_cod_large_cod) into (name, value) [was 75x13, now 225x12]
  # Scale densities before averaging?
  # mutate(mean_scod2 = log(mean_scod)/max(log(mean_scod)),
  #        mean_fle2 = log(mean_fle)/max(log(mean_fle)),
  #        mean_mcod2 = log(mean_mcod)/max(log(mean_mcod)),
  #        mean_biom_sc2 = as.numeric(scale(mean_scod2 + mean_fle2 + mean_mcod2) / 3))
  
# ggplot(ovr, aes(mean_biom_sc, mean_biom_sc2)) + 
#   geom_point()
# cor(ovr$mean_biom_sc, ovr$mean_biom_sc2)

write_csv(ovr, paste0(paste0(home, "/data/clean/s_overlap.csv")))

zib_model <- bf(
  value ~ name*mean_biom_sc,
  phi ~ name,
  zi ~ name,
  family = zero_inflated_beta()
)

fit <- brm(
  formula = zib_model,
  data = ovr,
  cores = 4,
  chains = 4,
  iter = 4000
)
Compiling Stan program...
Start sampling
fit
 Family: zero_inflated_beta 
  Links: mu = logit; phi = log; zi = logit 
Formula: value ~ name * mean_biom_sc 
         phi ~ name
         zi ~ name
   Data: ovr (Number of observations: 225) 
  Draws: 4 chains, each with iter = 4000; warmup = 2000; thin = 1;
         total post-warmup draws = 8000

Population-Level Effects: 
                                     Estimate Est.Error l-95% CI u-95% CI Rhat
Intercept                               -2.50      0.14    -2.77    -2.21 1.00
phi_Intercept                            2.15      0.19     1.77     2.50 1.00
zi_Intercept                            -3.66      0.74    -5.34    -2.43 1.00
nameflounder_small_cod                   0.79      0.19     0.41     1.15 1.00
namesmall_cod_large_cod                  1.71      0.20     1.32     2.09 1.00
mean_biom_sc                             0.11      0.12    -0.12     0.35 1.00
nameflounder_small_cod:mean_biom_sc      0.05      0.16    -0.26     0.36 1.00
namesmall_cod_large_cod:mean_biom_sc    -0.48      0.18    -0.83    -0.12 1.00
phi_nameflounder_small_cod              -0.39      0.25    -0.88     0.11 1.00
phi_namesmall_cod_large_cod             -1.28      0.24    -1.75    -0.80 1.00
zi_nameflounder_small_cod               -0.78      1.28    -3.53     1.54 1.00
zi_namesmall_cod_large_cod              -3.91      3.40   -12.69     0.49 1.00
                                     Bulk_ESS Tail_ESS
Intercept                                4687     4829
phi_Intercept                            4928     5265
zi_Intercept                             9697     6493
nameflounder_small_cod                   5423     5291
namesmall_cod_large_cod                  5841     5757
mean_biom_sc                             5098     5544
nameflounder_small_cod:mean_biom_sc      5367     5910
namesmall_cod_large_cod:mean_biom_sc     6004     6249
phi_nameflounder_small_cod               5675     5797
phi_namesmall_cod_large_cod              5555     6433
zi_nameflounder_small_cod                9463     5151
zi_namesmall_cod_large_cod               3374     2226

Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
# https://mc-stan.org/misc/warnings.html
tibble(rhat = rhat(fit),
       par = names(rhat(fit))) |> 
  arrange(desc(rhat)) |> 
  arrange(desc(rhat)) |> 
  as.data.frame()
        rhat                                    par
1  1.0015716                                 lprior
2  1.0008843            b_zi_nameflounder_small_cod
3  1.0008026               b_nameflounder_small_cod
4  1.0005838                            b_Intercept
5  1.0005211           b_zi_namesmall_cod_large_cod
6  1.0004156          b_phi_namesmall_cod_large_cod
7  1.0003372  b_nameflounder_small_cod:mean_biom_sc
8  1.0002853              b_namesmall_cod_large_cod
9  1.0002793                                   lp__
10 1.0002164 b_namesmall_cod_large_cod:mean_biom_sc
11 1.0001093                         b_mean_biom_sc
12 1.0000424                        b_phi_Intercept
13 0.9999678           b_phi_nameflounder_small_cod
14 0.9999411                         b_zi_Intercept
zi_intercept <- tidy(fit, effects = "fixed") |> 
  filter(component == "zi") |> 
  pull(estimate) 
Warning in tidy.brmsfit(fit, effects = "fixed"): some parameter names contain
underscores: term naming may be unreliable!
filter: removed 9 rows (75%), 3 rows remaining
# Transformed to a probability/proportion
plogis(zi_intercept)*100 # percent
              b_zi_Intercept  b_zi_nameflounder_small_cod 
                    2.512214                    31.458065 
b_zi_namesmall_cod_large_cod 
                    1.955686 
# Compare with data
ovr |> 
  count(value == 0) |> 
  mutate(prop = n / sum(n) *100)
count: now 2 rows and 2 columns, ungrouped
# A tibble: 2 × 3
  `value == 0`     n  prop
  <lgl>        <int> <dbl>
1 FALSE          222 98.7 
2 TRUE             3  1.33
get_variables(fit)
 [1] "b_Intercept"                           
 [2] "b_phi_Intercept"                       
 [3] "b_zi_Intercept"                        
 [4] "b_nameflounder_small_cod"              
 [5] "b_namesmall_cod_large_cod"             
 [6] "b_mean_biom_sc"                        
 [7] "b_nameflounder_small_cod:mean_biom_sc" 
 [8] "b_namesmall_cod_large_cod:mean_biom_sc"
 [9] "b_phi_nameflounder_small_cod"          
[10] "b_phi_namesmall_cod_large_cod"         
[11] "b_zi_nameflounder_small_cod"           
[12] "b_zi_namesmall_cod_large_cod"          
[13] "lprior"                                
[14] "lp__"                                  
[15] "accept_stat__"                         
[16] "stepsize__"                            
[17] "treedepth__"                           
[18] "n_leapfrog__"                          
[19] "divergent__"                           
[20] "energy__"                              
plot(fit)

# https://discourse.mc-stan.org/t/trying-to-understand-default-prior-for-the-hurdle-part/24337
priors <- prior_summary(fit)
priors
                prior     class                                 coef group resp
               (flat)         b                                                
               (flat)         b                         mean_biom_sc           
               (flat)         b               nameflounder_small_cod           
               (flat)         b  nameflounder_small_cod:mean_biom_sc           
               (flat)         b              namesmall_cod_large_cod           
               (flat)         b namesmall_cod_large_cod:mean_biom_sc           
               (flat)         b                                                
               (flat)         b               nameflounder_small_cod           
               (flat)         b              namesmall_cod_large_cod           
               (flat)         b                                                
               (flat)         b               nameflounder_small_cod           
               (flat)         b              namesmall_cod_large_cod           
 student_t(3, 0, 2.5) Intercept                                                
 student_t(3, 0, 2.5) Intercept                                                
       logistic(0, 1) Intercept                                                
 dpar nlpar lb ub       source
                       default
                  (vectorized)
                  (vectorized)
                  (vectorized)
                  (vectorized)
                  (vectorized)
  phi                  default
  phi             (vectorized)
  phi             (vectorized)
   zi                  default
   zi             (vectorized)
   zi             (vectorized)
                       default
  phi                  default
   zi                  default
stancode(fit)
// generated with brms 2.20.4
functions {
  /* zero-inflated beta log-PDF of a single response
   * Args:
   *   y: the response value
   *   mu: mean parameter of the beta distribution
   *   phi: precision parameter of the beta distribution
   *   zi: zero-inflation probability
   * Returns:
   *   a scalar to be added to the log posterior
   */
   real zero_inflated_beta_lpdf(real y, real mu, real phi, real zi) {
     row_vector[2] shape = [mu * phi, (1 - mu) * phi];
     if (y == 0) {
       return bernoulli_lpmf(1 | zi);
     } else {
       return bernoulli_lpmf(0 | zi) +
              beta_lpdf(y | shape[1], shape[2]);
     }
   }
  /* zero-inflated beta log-PDF of a single response
   * logit parameterization of the zero-inflation part
   * Args:
   *   y: the response value
   *   mu: mean parameter of the beta distribution
   *   phi: precision parameter of the beta distribution
   *   zi: linear predictor for zero-inflation part
   * Returns:
   *   a scalar to be added to the log posterior
   */
   real zero_inflated_beta_logit_lpdf(real y, real mu, real phi, real zi) {
     row_vector[2] shape = [mu * phi, (1 - mu) * phi];
     if (y == 0) {
       return bernoulli_logit_lpmf(1 | zi);
     } else {
       return bernoulli_logit_lpmf(0 | zi) +
              beta_lpdf(y | shape[1], shape[2]);
     }
   }
  // zero-inflated beta log-CCDF and log-CDF functions
  real zero_inflated_beta_lccdf(real y, real mu, real phi, real zi) {
    row_vector[2] shape = [mu * phi, (1 - mu) * phi];
    return bernoulli_lpmf(0 | zi) + beta_lccdf(y | shape[1], shape[2]);
  }
  real zero_inflated_beta_lcdf(real y, real mu, real phi, real zi) {
    return log1m_exp(zero_inflated_beta_lccdf(y | mu, phi, zi));
  }
}
data {
  int<lower=1> N;  // total number of observations
  vector[N] Y;  // response variable
  int<lower=1> K;  // number of population-level effects
  matrix[N, K] X;  // population-level design matrix
  int<lower=1> Kc;  // number of population-level effects after centering
  int<lower=1> K_phi;  // number of population-level effects
  matrix[N, K_phi] X_phi;  // population-level design matrix
  int<lower=1> Kc_phi;  // number of population-level effects after centering
  int<lower=1> K_zi;  // number of population-level effects
  matrix[N, K_zi] X_zi;  // population-level design matrix
  int<lower=1> Kc_zi;  // number of population-level effects after centering
  int prior_only;  // should the likelihood be ignored?
}
transformed data {
  matrix[N, Kc] Xc;  // centered version of X without an intercept
  vector[Kc] means_X;  // column means of X before centering
  matrix[N, Kc_phi] Xc_phi;  // centered version of X_phi without an intercept
  vector[Kc_phi] means_X_phi;  // column means of X_phi before centering
  matrix[N, Kc_zi] Xc_zi;  // centered version of X_zi without an intercept
  vector[Kc_zi] means_X_zi;  // column means of X_zi before centering
  for (i in 2:K) {
    means_X[i - 1] = mean(X[, i]);
    Xc[, i - 1] = X[, i] - means_X[i - 1];
  }
  for (i in 2:K_phi) {
    means_X_phi[i - 1] = mean(X_phi[, i]);
    Xc_phi[, i - 1] = X_phi[, i] - means_X_phi[i - 1];
  }
  for (i in 2:K_zi) {
    means_X_zi[i - 1] = mean(X_zi[, i]);
    Xc_zi[, i - 1] = X_zi[, i] - means_X_zi[i - 1];
  }
}
parameters {
  vector[Kc] b;  // regression coefficients
  real Intercept;  // temporary intercept for centered predictors
  vector[Kc_phi] b_phi;  // regression coefficients
  real Intercept_phi;  // temporary intercept for centered predictors
  vector[Kc_zi] b_zi;  // regression coefficients
  real Intercept_zi;  // temporary intercept for centered predictors
}
transformed parameters {
  real lprior = 0;  // prior contributions to the log posterior
  lprior += student_t_lpdf(Intercept | 3, 0, 2.5);
  lprior += student_t_lpdf(Intercept_phi | 3, 0, 2.5);
  lprior += logistic_lpdf(Intercept_zi | 0, 1);
}
model {
  // likelihood including constants
  if (!prior_only) {
    // initialize linear predictor term
    vector[N] mu = rep_vector(0.0, N);
    // initialize linear predictor term
    vector[N] phi = rep_vector(0.0, N);
    // initialize linear predictor term
    vector[N] zi = rep_vector(0.0, N);
    mu += Intercept + Xc * b;
    phi += Intercept_phi + Xc_phi * b_phi;
    zi += Intercept_zi + Xc_zi * b_zi;
    mu = inv_logit(mu);
    phi = exp(phi);
    for (n in 1:N) {
      target += zero_inflated_beta_logit_lpdf(Y[n] | mu[n], phi[n], zi[n]);
    }
  }
  // priors including constants
  target += lprior;
}
generated quantities {
  // actual population-level intercept
  real b_Intercept = Intercept - dot_product(means_X, b);
  // actual population-level intercept
  real b_phi_Intercept = Intercept_phi - dot_product(means_X_phi, b_phi);
  // actual population-level intercept
  real b_zi_Intercept = Intercept_zi - dot_product(means_X_zi, b_zi);
}
pp_check(fit) +
  coord_cartesian(expand = TRUE) +
  labs(x = "Schoener's overlap index",
       y = "Density") + 
  theme(legend.position.inside = c(0.8, 0.8)) + 
  guides(color = guide_legend(position = "inside"))
Using 10 posterior draws for ppc type 'dens_overlay' by default.
Coordinate system already present. Adding new coordinate system, which will
replace the existing one.

ggsave(paste0(home, "/figures/supp/schoener_zib_pp_check.pdf"), width = 11, height = 11, units = "cm")

conditional_effects(fit)

set.seed(123)

ovr <- ovr |> 
  mutate(name2 = ifelse(name == "flounder_large_cod", "Flounder\nLarge cod", name),
         name2 = ifelse(name == "flounder_small_cod", "Flounder\nSmall cod", name2),
         name2 = ifelse(name == "small_cod_large_cod", "Small cod\nLarge cod", name2))

p1 <- ovr |>
  data_grid(name = unique(name),
            mean_biom_sc = 0) |>
  mutate(name2 = ifelse(name == "flounder_large_cod", "Flounder\nLarge cod", name),
         name2 = ifelse(name == "flounder_small_cod", "Flounder\nSmall cod", name2),
         name2 = ifelse(name == "small_cod_large_cod", "Small cod\nLarge cod", name2)) |> 
  add_epred_draws(fit) |>
  ggplot(aes(.epred, name2)) +
  geom_jitter(data = ovr, aes(value, name2, color = name2), height = 0.2, width = 0, alpha = 0.4) +
  stat_pointinterval(.width = c(.66, .95), color = "gray30") +
  geom_vline(xintercept = 0.6, linetype = 2, alpha = 0.6) +
  scale_color_brewer(palette = "Set1") + 
  labs(x = "Schoener's overlap index", y = "", color = "") +
  geom_stripped_rows(color = NA) +
  coord_cartesian(expand = 0, xlim = c(-0.03, 1)) +
  theme(plot.margin = unit(c(0, 0.3, 0, 0), "cm")) +
  guides(color = "none") +
  NULL

pal <- brewer.pal(name = "RdBu", n = 7)

# Check CI's
fit |>
  spread_draws(b_mean_biom_sc,
               `b_nameflounder_small_cod:mean_biom_sc`,
               `b_namesmall_cod_large_cod:mean_biom_sc`) |>
  mutate("Flounder\nLarge cod" = b_mean_biom_sc,
         "Flounder\nSmall cod" = b_mean_biom_sc + `b_nameflounder_small_cod:mean_biom_sc`,
         "Small cod\nLarge cod" = b_mean_biom_sc + `b_namesmall_cod_large_cod:mean_biom_sc`,
         ) |> 
  summarise(mean = mean(`Small cod\nLarge cod`),
            upr = quantile(`Small cod\nLarge cod`, probs = 0.845))
summarise: now one row and 2 columns, ungrouped
# A tibble: 1 × 2
    mean    upr
   <dbl>  <dbl>
1 -0.365 -0.232
p2 <- fit |>
  spread_draws(b_mean_biom_sc,
               `b_nameflounder_small_cod:mean_biom_sc`,
               `b_namesmall_cod_large_cod:mean_biom_sc`) |>
  mutate("Flounder\nLarge cod" = b_mean_biom_sc,
         "Flounder\nSmall cod" = b_mean_biom_sc + `b_nameflounder_small_cod:mean_biom_sc`,
         "Small cod\nLarge cod" = b_mean_biom_sc + `b_namesmall_cod_large_cod:mean_biom_sc`,
         ) |>
  pivot_longer(c(`Flounder\nLarge cod`, `Flounder\nSmall cod`, `Small cod\nLarge cod`),
               names_to = "name2", values_to = "slope") |> 
  #ggplot(aes(slope, `Overlap group`, fill = after_stat(x < 0))) +
  ggplot(aes(slope, name2, fill = name2, color = name2, alpha = after_stat(x < 0))) +
  scale_fill_brewer(palette = "Set1", name = "") +
  scale_color_brewer(palette = "Set1", name = "") + 
  scale_alpha_manual(values = c(0.4, 0.7)) + 
  ggdist::stat_eye() +
  stat_pointinterval(.width = c(.66, .95), color = "gray10") +
  labs(y = "", x = "Slope of biomass density\non the mean Schoener's diet overlap") +
  geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.6) +
  theme(legend.position.inside = c(0.15, 0.1)) +
  guides(alpha = guide_legend(position = "inside", override.aes = list(alpha = c(0.2, 0.8))),
         fill = "none", color = "none") +
  geom_stripped_rows(aes(slope, name2, fill = name2),
                     inherit.aes = FALSE) +
  coord_cartesian(expand = 0) +
  NULL
pivot_longer: reorganized (Flounder
Large cod, Flounder
Small cod, Small cod
Large cod) into (name2, slope) [was 8000x9, now 24000x8]
Warning in geom_stripped_rows(aes(slope, name2, fill = name2), inherit.aes =
FALSE): Ignoring unknown aesthetics: x and fill
# Conditional
p3 <- ovr |>
  data_grid(name = unique(name),
            mean_biom_sc = seq_range(mean_biom_sc, n = 101)) |>
  mutate(name2 = ifelse(name == "flounder_large_cod", "Flounder\nLarge cod", name),
         name2 = ifelse(name == "flounder_small_cod", "Flounder\nSmall cod", name2),
         name2 = ifelse(name == "small_cod_large_cod", "Small cod\nLarge cod", name2)) |> 
  add_epred_draws(fit) |>
  ggplot(aes(x = mean_biom_sc, y = value, fill = name2, color = name2)) +
  stat_lineribbon(aes(y = .epred), .width = c(.95), alpha = 0.6) +
  geom_point(data = ovr, alpha = 0.5) +
  scale_fill_brewer(palette = "Pastel1") +
  scale_color_brewer(palette = "Set1") + 
  theme(legend.position.inside = c(0.90, 0.84)) + 
  labs(y = "Schoener's overlap index", x = "Scaled cod and flounder mean biomass density",
       fill = "", color = "") + 
  guides(fill = guide_legend(position = "inside"))

p3

pp <- (p1 + p2) + plot_layout(axes = "collect")

pp / p3 +
  plot_annotation(tag_levels = "a")

ggsave(paste0(home, "/figures/schoener_zib.pdf"), width = 17, height = 20, units = "cm")

# What's the actual predictions
sum <- ovr |>
  data_grid(name = unique(name),
            mean_biom_sc = 0) |>
  mutate(name2 = ifelse(name == "flounder_large_cod", "Flounder\nLarge cod", name),
         name2 = ifelse(name == "flounder_small_cod", "Flounder\nSmall cod", name2),
         name2 = ifelse(name == "small_cod_large_cod", "Small cod\nLarge cod", name2)) |> 
  add_epred_draws(fit) |> 
  group_by(name) |> 
  summarise(mean = mean(.epred),
            lwr = quantile(.epred, probs = 0.025),
            upr = quantile(.epred, probs = 0.975))
summarise: now 3 rows and 4 columns, ungrouped
sum
# A tibble: 3 × 4
  name                  mean    lwr    upr
  <chr>                <dbl>  <dbl>  <dbl>
1 flounder_large_cod  0.0741 0.0568 0.0959
2 flounder_small_cod  0.151  0.122  0.185 
3 small_cod_large_cod 0.311  0.258  0.370 
p1 + geom_vline(data = sum, aes(xintercept = mean))

knitr::knit_exit()